In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
import sklearn
import seaborn as sns

np.random.seed = 42

df=pd.read_csv('../data/players_away_matches.csv')

#removing id and names and saving df without them
df = df.drop(df.columns[0], axis=1)
df.drop(columns=['playerName'], inplace=True)
df.drop(columns=['team_name'], inplace=True)

X_mod, X_val = train_test_split(df, test_size=0.3, random_state=42)

X_val.to_csv('../data/for_validators/soccer_players_validators.csv', index=False)
X_mod.to_csv('../data/for_modellers/soccer_players_modellers.csv', index=False)
df=pd.read_csv('../data/for_modellers/soccer_players_modellers.csv')

In this milestone:¶

  1. Data Preprocessing:
  • removing some columns
  • NULL/missing values (deleting or mean or another strategy)
  • encoding of categorical columns (one hot encoding, ordinal encoding, ...)
  • data transformation
  • scaling/standarizing variables (in which models)
  • outliers
  • function for preprocessing for validators
  1. Pre-modelling

1. Data Preprocessing¶

1.1 Removing columns¶

1.2 Missing values¶

In [254]:
null_columns = df.columns[df.isnull().any()]

# Print the null columns
print("Null columns:")
print(null_columns)
Null columns:
Index(['aerials_lost', 'aerials_won', 'aerials_won_pct', 'assisted_shots',
       'ball_recoveries', 'blocked_passes', 'blocked_shots',
       'blocked_shots_saves', 'blocks', 'carries', 'carries_into_final_third',
       'carries_into_penalty_area', 'carry_distance',
       'carry_progressive_distance', 'clearances', 'corner_kicks',
       'corner_kicks_in', 'corner_kicks_out', 'corner_kicks_straight',
       'crosses_into_penalty_area', 'dispossessed', 'dribble_tackles',
       'dribble_tackles_pct', 'dribbled_past', 'dribbles',
       'dribbles_completed', 'dribbles_completed_pct', 'dribbles_vs', 'errors',
       'gca', 'minutes', 'miscontrols', 'npxg', 'nutmegs', 'pass_targets',
       'passes', 'passes_blocked', 'passes_completed', 'passes_completed_long',
       'passes_completed_medium', 'passes_completed_short', 'passes_dead',
       'passes_free_kicks', 'passes_ground', 'passes_head', 'passes_high',
       'passes_intercepted', 'passes_into_final_third',
       'passes_into_penalty_area', 'passes_left_foot', 'passes_live',
       'passes_long', 'passes_low', 'passes_medium', 'passes_offsides',
       'passes_oob', 'passes_other_body', 'passes_pct', 'passes_pct_long',
       'passes_pct_medium', 'passes_pct_short', 'passes_pressure',
       'passes_progressive_distance', 'passes_received', 'passes_received_pct',
       'passes_right_foot', 'passes_short', 'passes_switches',
       'passes_total_distance', 'pens_conceded', 'pens_won',
       'players_dribbled_past', 'pressure_regain_pct', 'pressure_regains',
       'pressures', 'pressures_att_3rd', 'pressures_def_3rd',
       'pressures_mid_3rd', 'progressive_carries', 'progressive_passes',
       'progressive_passes_received', 'sca', 'tackles', 'tackles_att_3rd',
       'tackles_def_3rd', 'tackles_interceptions', 'tackles_mid_3rd',
       'through_balls', 'throw_ins', 'touches', 'touches_att_3rd',
       'touches_att_pen_area', 'touches_def_3rd', 'touches_def_pen_area',
       'touches_live_ball', 'touches_mid_3rd', 'xa', 'xg',
       'championship_name'],
      dtype='object')
In [255]:
null_cols_count = df.isnull().any().sum()

print("Number of columns with null values:", null_cols_count)
Number of columns with null values: 99
In [256]:
print("Null counts for each column:")
df.isnull().sum().sort_values(ascending=False).head(20)
Null counts for each column:
Out[256]:
dribbles_completed_pct         26447
dribble_tackles_pct            23101
aerials_won_pct                15866
passes_pct_long                 7057
pressure_regain_pct             4735
passes_pct_medium               2492
passes_pct_short                2240
pens_won                        2099
pens_conceded                   2099
passes_pct                       610
passes_received_pct              516
championship_name                459
progressive_carries               71
carries_into_final_third          71
carries_into_penalty_area         71
throw_ins                         50
passes_oob                        50
touches_def_pen_area              50
passes_received                   50
passes_progressive_distance       50
dtype: int64
In [257]:
from matplotlib.colors import LinearSegmentedColormap

numeric_df = df.select_dtypes(include=['number'])
corr = numeric_df.corr()
fcorr = corr[(corr > 0.5) | (corr < -0.3)]
cmap = LinearSegmentedColormap.from_list("custom_cmap", ["blue", "white", "red"])

# Plot the correlation matrix
plt.figure(figsize=(80, 60))
sns.heatmap(fcorr, annot=True, cmap=cmap, vmin=-1, vmax=1, center=0, fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

We are searching for columns to delete

In [258]:
df.filter(regex='pct').columns
Out[258]:
Index(['aerials_won_pct', 'dribble_tackles_pct', 'dribbles_completed_pct',
       'passes_pct', 'passes_pct_long', 'passes_pct_medium',
       'passes_pct_short', 'passes_received_pct', 'pressure_regain_pct'],
      dtype='object')
In [259]:
df[['aerials_won','aerials_lost','aerials_won_pct']]
Out[259]:
aerials_won aerials_lost aerials_won_pct
0 0.0 0.0 NaN
1 1.0 0.0 100.0
2 1.0 0.0 100.0
3 1.0 0.0 100.0
4 0.0 3.0 0.0
... ... ... ...
53324 0.0 0.0 NaN
53325 1.0 2.0 33.3
53326 0.0 1.0 0.0
53327 0.0 0.0 NaN
53328 1.0 1.0 50.0

53329 rows × 3 columns

Firstly, we delete columns with 'pct' in their names because they represent the percentage of some statistic that is already included in other columns.

In [260]:
col_drop = df.filter(regex='pct').columns
df.drop(columns=col_drop, inplace=True)

Now, we delete columns with high correlation coefficients, as well as those that represent redundant information already captured by other columns

In [261]:
threshold = 0.7
numeric_df = df.select_dtypes(include=['number'])
corr = numeric_df.corr()

high_correlation_matrix = corr[(abs(corr) > threshold) & (corr != 1.0)]

for column in high_correlation_matrix.columns:
    highest_correlations = high_correlation_matrix[column].dropna()
    if not highest_correlations.empty:
        print(f"Highest correlations for column '{column}':")
        print(highest_correlations)
        print()
Highest correlations for column 'assisted_shots':
sca    0.802385
xa     0.700308
Name: assisted_shots, dtype: float64

Highest correlations for column 'blocked_passes':
blocks    0.895996
Name: blocked_passes, dtype: float64

Highest correlations for column 'blocks':
blocked_passes    0.895996
Name: blocks, dtype: float64

Highest correlations for column 'carries':
carry_distance                0.887823
carry_progressive_distance    0.797673
pass_targets                  0.910050
passes                        0.928569
passes_completed              0.931741
passes_completed_medium       0.845934
passes_completed_short        0.834066
passes_ground                 0.934277
passes_live                   0.950182
passes_medium                 0.862063
passes_received               0.970124
passes_right_foot             0.718032
passes_short                  0.828325
passes_total_distance         0.846310
touches                       0.933470
touches_live_ball             0.949898
touches_mid_3rd               0.881764
Name: carries, dtype: float64

Highest correlations for column 'carries_into_final_third':
progressive_carries    0.700473
Name: carries_into_final_third, dtype: float64

Highest correlations for column 'carry_distance':
carries                       0.887823
carry_progressive_distance    0.949062
pass_targets                  0.789850
passes                        0.821235
passes_completed              0.824671
passes_completed_medium       0.766727
passes_completed_short        0.704101
passes_ground                 0.834417
passes_live                   0.843563
passes_medium                 0.778882
passes_received               0.854270
passes_total_distance         0.765695
progressive_carries           0.750323
touches                       0.821000
touches_live_ball             0.838291
touches_mid_3rd               0.784414
Name: carry_distance, dtype: float64

Highest correlations for column 'carry_progressive_distance':
carries                0.797673
carry_distance         0.949062
pass_targets           0.706686
passes                 0.731715
passes_completed       0.732076
passes_ground          0.741688
passes_live            0.746378
passes_received        0.765064
progressive_carries    0.785896
touches                0.730432
touches_live_ball      0.741195
Name: carry_progressive_distance, dtype: float64

Highest correlations for column 'corner_kicks':
corner_kicks_in     0.743814
corner_kicks_out    0.752653
Name: corner_kicks, dtype: float64

Highest correlations for column 'corner_kicks_in':
corner_kicks    0.743814
Name: corner_kicks_in, dtype: float64

Highest correlations for column 'corner_kicks_out':
corner_kicks    0.752653
Name: corner_kicks_out, dtype: float64

Highest correlations for column 'dribble_tackles':
dribbles_vs    0.700013
Name: dribble_tackles, dtype: float64

Highest correlations for column 'dribbled_past':
dribbles_vs    0.862981
Name: dribbled_past, dtype: float64

Highest correlations for column 'dribbles':
dribbles_completed       0.871465
players_dribbled_past    0.891752
Name: dribbles, dtype: float64

Highest correlations for column 'dribbles_completed':
dribbles                 0.871465
players_dribbled_past    0.979029
Name: dribbles_completed, dtype: float64

Highest correlations for column 'dribbles_vs':
dribble_tackles    0.700013
dribbled_past      0.862981
Name: dribbles_vs, dtype: float64

Highest correlations for column 'interceptions':
tackles_interceptions    0.72691
Name: interceptions, dtype: float64

Highest correlations for column 'minutes':
touches              0.734107
touches_live_ball    0.700780
Name: minutes, dtype: float64

Highest correlations for column 'npxg':
shots_total    0.710451
xg             0.911297
Name: npxg, dtype: float64

Highest correlations for column 'pass_targets':
carries                       0.910050
carry_distance                0.789850
carry_progressive_distance    0.706686
passes                        0.832753
passes_completed              0.832053
passes_completed_medium       0.729545
passes_completed_short        0.808722
passes_ground                 0.846445
passes_live                   0.867828
passes_medium                 0.755058
passes_received               0.948210
passes_short                  0.821342
passes_total_distance         0.714773
touches                       0.856298
touches_live_ball             0.885329
touches_mid_3rd               0.833050
Name: pass_targets, dtype: float64

Highest correlations for column 'passes':
carries                        0.928569
carry_distance                 0.821235
carry_progressive_distance     0.731715
pass_targets                   0.832753
passes_completed               0.979099
passes_completed_medium        0.901123
passes_completed_short         0.819201
passes_ground                  0.934716
passes_live                    0.980689
passes_medium                  0.921942
passes_progressive_distance    0.731606
passes_received                0.932947
passes_right_foot              0.743326
passes_short                   0.811142
passes_total_distance          0.934619
touches                        0.981020
touches_live_ball              0.959676
touches_mid_3rd                0.862307
Name: passes, dtype: float64

Highest correlations for column 'passes_completed':
carries                       0.931741
carry_distance                0.824671
carry_progressive_distance    0.732076
pass_targets                  0.832053
passes                        0.979099
passes_completed_medium       0.935288
passes_completed_short        0.829897
passes_ground                 0.969800
passes_live                   0.980667
passes_medium                 0.936646
passes_received               0.941563
passes_right_foot             0.750939
passes_short                  0.806748
passes_total_distance         0.946188
touches                       0.954476
touches_live_ball             0.951318
touches_mid_3rd               0.872481
Name: passes_completed, dtype: float64

Highest correlations for column 'passes_completed_long':
passes_long                    0.894553
passes_progressive_distance    0.806589
passes_total_distance          0.852206
touches_def_3rd                0.710982
Name: passes_completed_long, dtype: float64

Highest correlations for column 'passes_completed_medium':
carries                  0.845934
carry_distance           0.766727
pass_targets             0.729545
passes                   0.901123
passes_completed         0.935288
passes_ground            0.919530
passes_live              0.913522
passes_medium            0.986425
passes_received          0.858815
passes_right_foot        0.700535
passes_total_distance    0.919726
touches                  0.871046
touches_live_ball        0.877326
touches_mid_3rd          0.796076
Name: passes_completed_medium, dtype: float64

Highest correlations for column 'passes_completed_short':
carries              0.834066
carry_distance       0.704101
pass_targets         0.808722
passes               0.819201
passes_completed     0.829897
passes_ground        0.817362
passes_live          0.823629
passes_received      0.842823
passes_short         0.986980
touches              0.820249
touches_live_ball    0.821731
touches_mid_3rd      0.811754
Name: passes_completed_short, dtype: float64

Highest correlations for column 'passes_dead':
passes_high    0.712027
Name: passes_dead, dtype: float64

Highest correlations for column 'passes_ground':
carries                       0.934277
carry_distance                0.834417
carry_progressive_distance    0.741688
pass_targets                  0.846445
passes                        0.934716
passes_completed              0.969800
passes_completed_medium       0.919530
passes_completed_short        0.817362
passes_live                   0.963378
passes_medium                 0.915410
passes_received               0.946617
passes_right_foot             0.746277
passes_short                  0.796345
passes_total_distance         0.894980
touches                       0.911640
touches_live_ball             0.932541
touches_mid_3rd               0.864728
Name: passes_ground, dtype: float64

Highest correlations for column 'passes_high':
passes_dead                    0.712027
passes_long                    0.816898
passes_progressive_distance    0.741458
Name: passes_high, dtype: float64

Highest correlations for column 'passes_into_final_third':
progressive_passes    0.72586
touches_mid_3rd       0.73174
Name: passes_into_final_third, dtype: float64

Highest correlations for column 'passes_live':
carries                       0.950182
carry_distance                0.843563
carry_progressive_distance    0.746378
pass_targets                  0.867828
passes                        0.980689
passes_completed              0.980667
passes_completed_medium       0.913522
passes_completed_short        0.823629
passes_ground                 0.963378
passes_medium                 0.928450
passes_received               0.962516
passes_right_foot             0.757484
passes_short                  0.816100
passes_total_distance         0.920206
touches                       0.966149
touches_live_ball             0.978919
touches_mid_3rd               0.894138
Name: passes_live, dtype: float64

Highest correlations for column 'passes_long':
passes_completed_long          0.894553
passes_high                    0.816898
passes_progressive_distance    0.838354
passes_total_distance          0.746593
touches_def_3rd                0.742694
Name: passes_long, dtype: float64

Highest correlations for column 'passes_medium':
carries                    0.862063
carry_distance             0.778882
pass_targets               0.755058
passes                     0.921942
passes_completed           0.936646
passes_completed_medium    0.986425
passes_ground              0.915410
passes_live                0.928450
passes_received            0.873890
passes_total_distance      0.906930
touches                    0.899698
touches_live_ball          0.901190
touches_mid_3rd            0.823866
Name: passes_medium, dtype: float64

Highest correlations for column 'passes_other_body':
touches_def_pen_area    0.738015
Name: passes_other_body, dtype: float64

Highest correlations for column 'passes_progressive_distance':
passes                   0.731606
passes_completed_long    0.806589
passes_high              0.741458
passes_long              0.838354
passes_total_distance    0.827215
touches_def_3rd          0.795014
Name: passes_progressive_distance, dtype: float64

Highest correlations for column 'passes_received':
carries                       0.970124
carry_distance                0.854270
carry_progressive_distance    0.765064
pass_targets                  0.948210
passes                        0.932947
passes_completed              0.941563
passes_completed_medium       0.858815
passes_completed_short        0.842823
passes_ground                 0.946617
passes_live                   0.962516
passes_medium                 0.873890
passes_right_foot             0.726258
passes_short                  0.838963
passes_total_distance         0.850692
touches                       0.933362
touches_live_ball             0.956220
touches_mid_3rd               0.890305
Name: passes_received, dtype: float64

Highest correlations for column 'passes_right_foot':
carries                    0.718032
passes                     0.743326
passes_completed           0.750939
passes_completed_medium    0.700535
passes_ground              0.746277
passes_live                0.757484
passes_received            0.726258
passes_total_distance      0.736669
touches                    0.723090
touches_live_ball          0.731926
Name: passes_right_foot, dtype: float64

Highest correlations for column 'passes_short':
carries                   0.828325
pass_targets              0.821342
passes                    0.811142
passes_completed          0.806748
passes_completed_short    0.986980
passes_ground             0.796345
passes_live               0.816100
passes_received           0.838963
touches                   0.819769
touches_live_ball         0.822157
touches_mid_3rd           0.814326
Name: passes_short, dtype: float64

Highest correlations for column 'passes_total_distance':
carries                        0.846310
carry_distance                 0.765695
pass_targets                   0.714773
passes                         0.934619
passes_completed               0.946188
passes_completed_long          0.852206
passes_completed_medium        0.919726
passes_ground                  0.894980
passes_live                    0.920206
passes_long                    0.746593
passes_medium                  0.906930
passes_progressive_distance    0.827215
passes_received                0.850692
passes_right_foot              0.736669
touches                        0.895703
touches_def_3rd                0.731744
touches_live_ball              0.877680
touches_mid_3rd                0.757201
Name: passes_total_distance, dtype: float64

Highest correlations for column 'pens_att':
pens_made    0.898519
Name: pens_att, dtype: float64

Highest correlations for column 'pens_made':
pens_att    0.898519
Name: pens_made, dtype: float64

Highest correlations for column 'players_dribbled_past':
dribbles              0.891752
dribbles_completed    0.979029
Name: players_dribbled_past, dtype: float64

Highest correlations for column 'pressure_regains':
pressures    0.787829
Name: pressure_regains, dtype: float64

Highest correlations for column 'pressures':
pressure_regains     0.787829
pressures_mid_3rd    0.886289
Name: pressures, dtype: float64

Highest correlations for column 'pressures_mid_3rd':
pressures    0.886289
Name: pressures_mid_3rd, dtype: float64

Highest correlations for column 'progressive_carries':
carries_into_final_third      0.700473
carry_distance                0.750323
carry_progressive_distance    0.785896
Name: progressive_carries, dtype: float64

Highest correlations for column 'progressive_passes':
passes_into_final_third    0.725860
touches_mid_3rd            0.701927
Name: progressive_passes, dtype: float64

Highest correlations for column 'progressive_passes_received':
touches_att_pen_area    0.726256
Name: progressive_passes_received, dtype: float64

Highest correlations for column 'sca':
assisted_shots    0.802385
Name: sca, dtype: float64

Highest correlations for column 'shots_total':
npxg    0.710451
Name: shots_total, dtype: float64

Highest correlations for column 'tackles':
tackles_def_3rd          0.790656
tackles_interceptions    0.864330
tackles_won              0.850674
Name: tackles, dtype: float64

Highest correlations for column 'tackles_def_3rd':
tackles                  0.790656
tackles_interceptions    0.701239
Name: tackles_def_3rd, dtype: float64

Highest correlations for column 'tackles_interceptions':
interceptions      0.726910
tackles            0.864330
tackles_def_3rd    0.701239
tackles_won        0.732648
Name: tackles_interceptions, dtype: float64

Highest correlations for column 'tackles_won':
tackles                  0.850674
tackles_interceptions    0.732648
Name: tackles_won, dtype: float64

Highest correlations for column 'touches':
carries                       0.933470
carry_distance                0.821000
carry_progressive_distance    0.730432
minutes                       0.734107
pass_targets                  0.856298
passes                        0.981020
passes_completed              0.954476
passes_completed_medium       0.871046
passes_completed_short        0.820249
passes_ground                 0.911640
passes_live                   0.966149
passes_medium                 0.899698
passes_received               0.933362
passes_right_foot             0.723090
passes_short                  0.819769
passes_total_distance         0.895703
touches_live_ball             0.984702
touches_mid_3rd               0.876376
Name: touches, dtype: float64

Highest correlations for column 'touches_att_pen_area':
progressive_passes_received    0.726256
Name: touches_att_pen_area, dtype: float64

Highest correlations for column 'touches_def_3rd':
passes_completed_long          0.710982
passes_long                    0.742694
passes_progressive_distance    0.795014
passes_total_distance          0.731744
touches_def_pen_area           0.726768
Name: touches_def_3rd, dtype: float64

Highest correlations for column 'touches_def_pen_area':
passes_other_body    0.738015
touches_def_3rd      0.726768
Name: touches_def_pen_area, dtype: float64

Highest correlations for column 'touches_live_ball':
carries                       0.949898
carry_distance                0.838291
carry_progressive_distance    0.741195
minutes                       0.700780
pass_targets                  0.885329
passes                        0.959676
passes_completed              0.951318
passes_completed_medium       0.877326
passes_completed_short        0.821731
passes_ground                 0.932541
passes_live                   0.978919
passes_medium                 0.901190
passes_received               0.956220
passes_right_foot             0.731926
passes_short                  0.822157
passes_total_distance         0.877680
touches                       0.984702
touches_mid_3rd               0.902909
Name: touches_live_ball, dtype: float64

Highest correlations for column 'touches_mid_3rd':
carries                    0.881764
carry_distance             0.784414
pass_targets               0.833050
passes                     0.862307
passes_completed           0.872481
passes_completed_medium    0.796076
passes_completed_short     0.811754
passes_ground              0.864728
passes_into_final_third    0.731740
passes_live                0.894138
passes_medium              0.823866
passes_received            0.890305
passes_short               0.814326
passes_total_distance      0.757201
progressive_passes         0.701927
touches                    0.876376
touches_live_ball          0.902909
Name: touches_mid_3rd, dtype: float64

Highest correlations for column 'xa':
assisted_shots    0.700308
Name: xa, dtype: float64

Highest correlations for column 'xg':
npxg    0.911297
Name: xg, dtype: float64

passes, touches, carries(carr), dribbles(dribble), corner, blocks(block)

In [262]:
passes_columns = df.filter(regex='passes').columns
arr_passes = list()
for col in passes_columns:
    if abs(df[col].corr(df['passes'])) > 0.6:
        arr_passes.append(col)
arr_passes.pop(0)
arr_passes
Out[262]:
['passes_completed',
 'passes_completed_long',
 'passes_completed_medium',
 'passes_completed_short',
 'passes_ground',
 'passes_into_final_third',
 'passes_live',
 'passes_long',
 'passes_low',
 'passes_medium',
 'passes_progressive_distance',
 'passes_received',
 'passes_right_foot',
 'passes_short',
 'passes_total_distance',
 'progressive_passes']
In [263]:
touches_columns = df.filter(regex='touches').columns
arr_touches = list()
for col in touches_columns:
    if abs(df[col].corr(df['touches'])) > 0.6:
        arr_touches.append(col)
arr_touches.pop(0)
arr_touches
Out[263]:
['touches_def_3rd', 'touches_live_ball', 'touches_mid_3rd']
In [264]:
carr_columns = df.filter(regex='carr').columns
arr_carr = list()
for col in carr_columns:
    if abs(df[col].corr(df['carries'])) > 0.6:
        arr_carr.append(col)
arr_carr.pop(0)
arr_carr
Out[264]:
['carry_distance', 'carry_progressive_distance', 'progressive_carries']
In [265]:
dribble_columns = df.filter(regex='dribble').columns
arr_dribble = list()
for col in dribble_columns:
    if abs(df[col].corr(df['dribbles'])) > 0.6:
        arr_dribble.append(col)
arr_dribble.pop(0)
arr_dribble
Out[265]:
['dribbles_completed', 'players_dribbled_past']
In [266]:
corner_columns = df.filter(regex='corner').columns
arr_corner = list()
for col in corner_columns:
    if abs(df[col].corr(df['corner_kicks'])) > 0.6:
        arr_corner.append(col)
arr_corner.pop(0)
arr_corner
Out[266]:
['corner_kicks_in', 'corner_kicks_out']
In [267]:
block_columns = df.filter(regex='block').columns
arr_block = list()
for col in block_columns:
    if abs(df[col].corr(df['blocks'])) > 0.6:
        arr_block.append(col)
arr_block.pop(1)
arr_block
Out[267]:
['blocked_passes']
In [268]:
cols_to_drop = arr_passes+arr_touches+arr_carr+arr_dribble+arr_corner+arr_block
df.drop(columns=cols_to_drop, inplace=True)
In [269]:
threshold = 0.7
numeric_df = df.select_dtypes(include=['number'])
corr = numeric_df.corr()

high_correlation_matrix = corr[(abs(corr) > threshold) & (corr != 1.0)]

for column in high_correlation_matrix.columns:
    highest_correlations = high_correlation_matrix[column].dropna()
    if not highest_correlations.empty:
        print(f"Highest correlations for column '{column}':")
        print(highest_correlations)
        print()
        
Highest correlations for column 'assisted_shots':
sca    0.802385
xa     0.700308
Name: assisted_shots, dtype: float64

Highest correlations for column 'carries':
pass_targets    0.910050
passes          0.928569
touches         0.933470
Name: carries, dtype: float64

Highest correlations for column 'dribble_tackles':
dribbles_vs    0.700013
Name: dribble_tackles, dtype: float64

Highest correlations for column 'dribbled_past':
dribbles_vs    0.862981
Name: dribbled_past, dtype: float64

Highest correlations for column 'dribbles_vs':
dribble_tackles    0.700013
dribbled_past      0.862981
Name: dribbles_vs, dtype: float64

Highest correlations for column 'interceptions':
tackles_interceptions    0.72691
Name: interceptions, dtype: float64

Highest correlations for column 'minutes':
touches    0.734107
Name: minutes, dtype: float64

Highest correlations for column 'npxg':
shots_total    0.710451
xg             0.911297
Name: npxg, dtype: float64

Highest correlations for column 'pass_targets':
carries    0.910050
passes     0.832753
touches    0.856298
Name: pass_targets, dtype: float64

Highest correlations for column 'passes':
carries         0.928569
pass_targets    0.832753
touches         0.981020
Name: passes, dtype: float64

Highest correlations for column 'passes_dead':
passes_high    0.712027
Name: passes_dead, dtype: float64

Highest correlations for column 'passes_high':
passes_dead    0.712027
Name: passes_high, dtype: float64

Highest correlations for column 'passes_other_body':
touches_def_pen_area    0.738015
Name: passes_other_body, dtype: float64

Highest correlations for column 'pens_att':
pens_made    0.898519
Name: pens_att, dtype: float64

Highest correlations for column 'pens_made':
pens_att    0.898519
Name: pens_made, dtype: float64

Highest correlations for column 'pressure_regains':
pressures    0.787829
Name: pressure_regains, dtype: float64

Highest correlations for column 'pressures':
pressure_regains     0.787829
pressures_mid_3rd    0.886289
Name: pressures, dtype: float64

Highest correlations for column 'pressures_mid_3rd':
pressures    0.886289
Name: pressures_mid_3rd, dtype: float64

Highest correlations for column 'progressive_passes_received':
touches_att_pen_area    0.726256
Name: progressive_passes_received, dtype: float64

Highest correlations for column 'sca':
assisted_shots    0.802385
Name: sca, dtype: float64

Highest correlations for column 'shots_total':
npxg    0.710451
Name: shots_total, dtype: float64

Highest correlations for column 'tackles':
tackles_def_3rd          0.790656
tackles_interceptions    0.864330
tackles_won              0.850674
Name: tackles, dtype: float64

Highest correlations for column 'tackles_def_3rd':
tackles                  0.790656
tackles_interceptions    0.701239
Name: tackles_def_3rd, dtype: float64

Highest correlations for column 'tackles_interceptions':
interceptions      0.726910
tackles            0.864330
tackles_def_3rd    0.701239
tackles_won        0.732648
Name: tackles_interceptions, dtype: float64

Highest correlations for column 'tackles_won':
tackles                  0.850674
tackles_interceptions    0.732648
Name: tackles_won, dtype: float64

Highest correlations for column 'touches':
carries         0.933470
minutes         0.734107
pass_targets    0.856298
passes          0.981020
Name: touches, dtype: float64

Highest correlations for column 'touches_att_pen_area':
progressive_passes_received    0.726256
Name: touches_att_pen_area, dtype: float64

Highest correlations for column 'touches_def_pen_area':
passes_other_body    0.738015
Name: touches_def_pen_area, dtype: float64

Highest correlations for column 'xa':
assisted_shots    0.700308
Name: xa, dtype: float64

Highest correlations for column 'xg':
npxg    0.911297
Name: xg, dtype: float64

In [270]:
cols_to_drop2 =  ['xg', 'tackles_def_3rd', 'tackles_won', 'tackles_interceptions', 'npxg', 'sca', 'xa', 'pass_targets', 'dribble_tackles', 'dribbled_past', 'tackles_interceptions', 'pens_made', 'pressure_regains', 'pressures_mid_3rd']

df.drop(columns=cols_to_drop2, inplace=True)

cols_to_drop3 = ['touches','carries']
df.drop(columns=cols_to_drop3, inplace=True)
In [271]:
print("Null counts for each column:")
df.isnull().sum().sort_values(ascending=False).head(20)
Null counts for each column:
Out[271]:
pens_won                     2099
pens_conceded                2099
championship_name             459
carries_into_penalty_area      71
carries_into_final_third       71
passes_other_body              50
dribbles                       50
dribbles_vs                    50
errors                         50
gca                            50
passes_switches                50
passes_pressure                50
miscontrols                    50
crosses_into_penalty_area      50
nutmegs                        50
passes_oob                     50
passes_offsides                50
passes                         50
passes_blocked                 50
passes_dead                    50
dtype: int64
In [272]:
null_cols_count = df.isnull().any().sum()

print("Number of columns with null values:", null_cols_count)
Number of columns with null values: 50
In [273]:
df[df['passes'].isna()]
Out[273]:
awayScore awayTeamName awayTeamTacticalSchema homeScore homeTeamName homeTeamTacticalSchema matchDate matchWeek aerials_lost aerials_won ... shots_total tackles tackles_att_3rd tackles_mid_3rd through_balls throw_ins touches_att_3rd touches_att_pen_area touches_def_pen_area championship_name
61 1 Valladolid (4-4-2) 4 Real Sociedad (4-4-2) 2021-05-16 37 NaN NaN ... 1 NaN NaN NaN NaN NaN NaN NaN NaN liga
2312 0 Rayo Vallecano (4-2-3-1) 1 Real Sociedad (4-1-4-1) 2021-08-22 2 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
2709 1 Spezia (4-3-3) 2 Lazio (3-5-2) 2021-04-03 29 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN seria a
4964 2 Spezia (4-3-3) 2 Parma (4-4-2◆) 2020-10-25 5 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN seria a
5378 0 Valladolid (4-4-2) 2 Atlético Madrid (3-4-1-2) 2020-12-05 12 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
5732 2 Alavés (4-3-3) 0 Elche (4-4-2) 2021-05-11 36 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
6520 0 Toulouse (4-1-4-1) 1 Marseille (4-3-3) 2020-02-08 24 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN ligue 1
7975 1 Villarreal (4-4-1-1) 1 Celta Vigo (4-4-2◆) 2021-11-20 14 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
9177 1 Rennes (4-4-2) 0 Nîmes (4-3-3) 2020-01-15 12 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN ligue 1
9558 1 Valladolid (4-4-2) 4 Real Sociedad (4-4-2) 2021-05-16 37 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
10799 2 Genoa (3-5-2) 1 Milan (4-2-3-1) 2020-03-08 26 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN seria a
11526 0 Getafe (4-2-2-2) 0 Levante (3-4-1-2) 2019-02-02 22 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
13297 1 Marseille (3-4-3) 1 Nice (4-4-2) 2021-10-27 3 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN ligue 1
13325 1 Marseille (3-4-3) 1 Nice (4-4-2) 2021-10-27 3 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN ligue 1
13959 2 Napoli (4-2-3-1) 0 Fiorentina (3-5-2) 2021-05-16 37 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN seria a
15550 1 Valladolid (4-4-2) 1 Sevilla (4-3-3) 2020-12-19 14 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
15605 2 Celta Vigo (4-3-3) 3 Leganés (3-4-3) 2019-12-08 16 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
16643 1 Leicester City (3-4-3) 2 Chelsea (3-4-3) 2021-05-18 37 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN premier league
16746 1 Rayo Vallecano (4-2-3-1) 1 Levante (4-3-3) 2021-09-11 4 NaN NaN ... 4 NaN NaN NaN NaN NaN NaN NaN NaN liga
19130 0 Levante (3-5-2) 2 Celta Vigo (4-4-2◆) 2021-04-30 34 NaN NaN ... 2 NaN NaN NaN NaN NaN NaN NaN NaN liga
22168 1 Stuttgart (3-5-2) 1 Mönchengladbach (3-4-3) 2021-10-16 8 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN bundesliga
23217 1 Nürnberg (4-2-2-2) 2 Mainz 05 (4-4-2◆) 2019-01-26 19 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN bundesliga
24577 2 Spezia (4-4-2) 2 Sampdoria (4-4-2) 2021-05-12 36 NaN NaN ... 1 NaN NaN NaN NaN NaN NaN NaN NaN seria a
26189 1 Internazionale (4-2-3-1) 2 Cagliari (4-4-2◆) 2019-03-01 26 NaN NaN ... 10 NaN NaN NaN NaN NaN NaN NaN NaN seria a
27603 0 Real Betis (4-1-4-1) 1 Getafe (4-4-2) 2020-01-26 21 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
28068 1 Valladolid (4-4-2) 1 Elche (4-4-2) 2021-04-21 31 NaN NaN ... 3 NaN NaN NaN NaN NaN NaN NaN NaN liga
28831 1 Spezia (4-3-3) 4 Crotone (3-5-2) 2020-12-12 11 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN seria a
30267 1 Marseille (3-4-3) 1 Nice (4-4-2) 2021-10-27 3 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN ligue 1
30975 0 Osasuna (4-5-1) 2 Real Madrid (4-2-3-1) 2021-05-01 34 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
31110 2 Levante (4-4-2) 2 Alavés (4-2-3-1) 2021-05-08 35 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
33383 0 Espanyol (4-1-4-1) 1 Rayo Vallecano (4-2-3-1) 2021-12-05 16 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
35395 1 Marseille (3-4-3) 1 Nice (4-4-2) 2021-10-27 3 NaN NaN ... 2 NaN NaN NaN NaN NaN NaN NaN NaN ligue 1
35886 3 Internazionale (3-5-2) 1 Napoli (4-3-3) 2020-01-06 18 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN seria a
35907 2 Spezia (4-3-3) 2 Cagliari (4-2-3-1) 2020-11-29 9 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN seria a
36248 1 Spezia (3-4-3) 6 Lazio (4-3-3) 2021-08-28 2 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN seria a
37849 2 Rayo Vallecano (4-2-3-1) 1 Athletic Club (4-4-2) 2021-09-21 6 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
39549 0 Real Betis (4-2-3-1) 1 Atlético Madrid (4-4-2) 2020-07-11 36 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
40371 1 Valladolid (4-4-2) 1 Eibar (4-4-2) 2021-02-13 23 NaN NaN ... 1 NaN NaN NaN NaN NaN NaN NaN NaN liga
43149 0 Internazionale (3-5-2) 2 Juventus (4-3-3) 2020-03-08 26 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN seria a
43296 0 Celta Vigo (4-3-3) 0 Granada (3-4-3) 2020-02-29 26 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
43672 3 Eintracht Frankfurt (3-4-1-2) 0 Hannover 96 (3-4-1-2) 2019-02-24 23 NaN NaN ... 1 NaN NaN NaN NaN NaN NaN NaN NaN bundesliga
43698 2 Milan (4-3-3) 1 Genoa (3-4-1-2) 2019-10-05 7 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN seria a
44826 0 Espanyol (3-5-2) 1 Barcelona (4-4-2◆) 2020-07-08 35 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
45483 1 Real Betis (4-1-4-1) 1 Eibar (4-2-3-1) 2020-02-02 22 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
45986 2 Atlético Madrid (3-5-2) 1 Valladolid (4-4-2) 2021-05-22 38 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
46276 2 Mallorca (4-1-4-1) 4 Getafe (4-4-2) 2019-09-22 5 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
47642 1 Levante (4-4-2) 2 Getafe (4-4-2) 2021-05-16 37 NaN NaN ... 1 NaN NaN NaN NaN NaN NaN NaN NaN liga
49736 1 Empoli (3-5-2) 2 Internazionale (4-2-3-1) 2019-05-26 38 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN seria a
51827 1 Barcelona (3-4-3) 2 Real Madrid (4-1-4-1) 2021-04-10 30 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga
51934 2 Levante (4-4-2) 4 Valencia (4-4-2) 2020-09-13 1 NaN NaN ... 0 NaN NaN NaN NaN NaN NaN NaN NaN liga

50 rows × 73 columns

It turns out that all 50 rows that contains missing values are matches with no given statistics

In [274]:
df.dropna(subset=['passes'],inplace=True)
In [275]:
null_cols_count = df.isnull().any().sum()

print("Number of columns with null values:", null_cols_count)
Number of columns with null values: 6
In [276]:
print("Null counts for each column:")
df.isnull().sum().sort_values(ascending=False)
Null counts for each column:
Out[276]:
pens_won                     2053
pens_conceded                2053
championship_name             459
carries_into_final_third       21
carries_into_penalty_area      21
                             ... 
goals                           0
interceptions                   0
awayTeamName                    0
miscontrols                     0
awayScore                       0
Length: 73, dtype: int64
In [301]:
df['pens_won'].fillna(0, inplace=True)
df['pens_conceded'].fillna(0, inplace=True)
df['minutes'].fillna(90, inplace=True)
df['carries_into_final_third'].fillna(0, inplace=True)
df['carries_into_penalty_area'].fillna(0, inplace=True)

df['championship_name'].fillna('unknown', inplace=True)
In [303]:
null_cols_count = df.isnull().any().sum()

print("Number of columns with null values:", null_cols_count)
Number of columns with null values: 0

0 NULLs left - 1.2 done:)

1.3 Encoding of categorical values¶

Here we know from EDA that we have some categorical columns that have a lot of different unstructured values - we need to think about what to do with this

In [304]:
cat_cols = df.select_dtypes(include='object').columns

print(f'categorical columns: {cat_cols}')
print(f'number of categorical columns: {len(cat_cols)}')
categorical columns: Index(['awayTeamName', 'awayTeamTacticalSchema', 'homeTeamName',
       'homeTeamTacticalSchema', 'matchDate', 'championship_name'],
      dtype='object')
number of categorical columns: 6
In [305]:
unique_values_count = df[cat_cols].nunique()
print(unique_values_count)
awayTeamName              129
awayTeamTacticalSchema     32
homeTeamName              122
homeTeamTacticalSchema     32
matchDate                 565
championship_name           6
dtype: int64
In [306]:
unique_values = df['awayTeamTacticalSchema'].unique()
print(unique_values)
['(4-4-2)' '(4-2-3-1)' '(3-5-2)' '(4-5-1)' '(3-4-1-2)' '(4-1-4-1)'
 '(4-3-3)' '(3-4-3)' '(5-1-2-2)' '(4-4-2◆)' '(4-4-1-1)' '(4-2-2-2)'
 '(3-5-1-1)' '(3-2-2-2-1)' '(3-2-3-1-1)' '(4-1-3-2)' '(3-2-2-1-2)'
 '(5-3-2)' '(5-4-1)' '(3-2-3-2)' '(4-2-3-1◆)' '(4-3-2-1)' '(3-1-4-1-1)'
 '(4-2-2-1-1)' '(4-3-1-2)' '(3-1-4-2)' '(3-4-3◆)' '(3-3-2-2)' '(3-2-4-1)'
 '(3-3-2-1-1)' 'United' 'Marseille']

We have some incorrect values here, let's change/remove them.

In [307]:
df[df['awayTeamTacticalSchema']=='United']
Out[307]:
awayScore awayTeamName awayTeamTacticalSchema homeScore homeTeamName homeTeamTacticalSchema matchDate matchWeek aerials_lost aerials_won ... shots_total tackles tackles_att_3rd tackles_mid_3rd through_balls throw_ins touches_att_3rd touches_att_pen_area touches_def_pen_area championship_name
4286 1 Leeds United United 1 Burnley (4-4-2) 2021-08-29 3 5.0 4.0 ... 0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 12.0 premier league
22856 1 Leeds United United 1 Burnley (4-4-2) 2021-08-29 3 3.0 1.0 ... 3 0.0 0.0 0.0 0.0 0.0 14.0 10.0 1.0 premier league
47938 1 Leeds United United 1 Burnley (4-4-2) 2021-08-29 3 4.0 2.0 ... 0 0.0 0.0 0.0 0.0 3.0 19.0 1.0 2.0 premier league
48081 1 Leeds United United 1 Burnley (4-4-2) 2021-08-29 3 0.0 0.0 ... 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 44.0 premier league
48988 1 Leeds United United 1 Burnley (4-4-2) 2021-08-29 3 0.0 0.0 ... 2 0.0 0.0 0.0 0.0 1.0 7.0 2.0 0.0 premier league
53264 1 Leeds United United 1 Burnley (4-4-2) 2021-08-29 3 3.0 5.0 ... 1 0.0 0.0 0.0 0.0 0.0 5.0 2.0 6.0 premier league

6 rows × 73 columns

In [308]:
df[df['awayTeamTacticalSchema']=='Marseille']
Out[308]:
awayScore awayTeamName awayTeamTacticalSchema homeScore homeTeamName homeTeamTacticalSchema matchDate matchWeek aerials_lost aerials_won ... shots_total tackles tackles_att_3rd tackles_mid_3rd through_balls throw_ins touches_att_3rd touches_att_pen_area touches_def_pen_area championship_name
14150 0 Marseille Marseille 2 Lille (4-4-2) 2021-10-03 9 0.0 2.0 ... 2 0.0 0.0 0.0 0.0 0.0 10.0 5.0 1.0 ligue 1
15110 0 Marseille Marseille 2 Lille (4-4-2) 2021-10-03 9 0.0 0.0 ... 1 1.0 0.0 1.0 0.0 1.0 29.0 1.0 0.0 ligue 1
18643 0 Marseille Marseille 2 Lille (4-4-2) 2021-10-03 9 2.0 0.0 ... 0 1.0 1.0 0.0 0.0 8.0 18.0 1.0 7.0 ligue 1
19562 0 Marseille Marseille 2 Lille (4-4-2) 2021-10-03 9 0.0 0.0 ... 0 0.0 0.0 0.0 0.0 0.0 14.0 1.0 0.0 ligue 1
24063 0 Marseille Marseille 2 Lille (4-4-2) 2021-10-03 9 0.0 0.0 ... 0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0 ligue 1
25665 0 Marseille Marseille 2 Lille (4-4-2) 2021-10-03 9 1.0 0.0 ... 2 0.0 0.0 0.0 0.0 0.0 5.0 3.0 0.0 ligue 1
27168 0 Marseille Marseille 2 Lille (4-4-2) 2021-10-03 9 1.0 2.0 ... 1 1.0 0.0 0.0 0.0 3.0 13.0 1.0 0.0 ligue 1
33450 0 Marseille Marseille 2 Lille (4-4-2) 2021-10-03 9 0.0 0.0 ... 0 1.0 0.0 1.0 0.0 6.0 26.0 2.0 0.0 ligue 1
41038 0 Marseille Marseille 2 Lille (4-4-2) 2021-10-03 9 0.0 1.0 ... 0 6.0 0.0 1.0 0.0 0.0 9.0 0.0 9.0 ligue 1

9 rows × 73 columns

In [309]:
#let's remove rows with teamNames as tactical schema
df = df[(df['awayTeamTacticalSchema'] != 'United') & (df['awayTeamTacticalSchema'] != 'Marseille')]
In [310]:
unique_values = df['awayTeamTacticalSchema'].unique()
print(unique_values)
['(4-4-2)' '(4-2-3-1)' '(3-5-2)' '(4-5-1)' '(3-4-1-2)' '(4-1-4-1)'
 '(4-3-3)' '(3-4-3)' '(5-1-2-2)' '(4-4-2◆)' '(4-4-1-1)' '(4-2-2-2)'
 '(3-5-1-1)' '(3-2-2-2-1)' '(3-2-3-1-1)' '(4-1-3-2)' '(3-2-2-1-2)'
 '(5-3-2)' '(5-4-1)' '(3-2-3-2)' '(4-2-3-1◆)' '(4-3-2-1)' '(3-1-4-1-1)'
 '(4-2-2-1-1)' '(4-3-1-2)' '(3-1-4-2)' '(3-4-3◆)' '(3-3-2-2)' '(3-2-4-1)'
 '(3-3-2-1-1)']
In [311]:
df.loc[df['awayTeamTacticalSchema'] == '(4-4-2◆)', 'awayTeamTacticalSchema'] = '(4-4-2)'
df.loc[df['awayTeamTacticalSchema'] == '(4-2-3-1◆)', 'awayTeamTacticalSchema'] = '(4-2-3-1)'
df.loc[df['awayTeamTacticalSchema'] == '(3-4-3◆)', 'awayTeamTacticalSchema'] = '(3-4-3)'
In [312]:
unique_values = df['awayTeamTacticalSchema'].unique()
print(unique_values)
['(4-4-2)' '(4-2-3-1)' '(3-5-2)' '(4-5-1)' '(3-4-1-2)' '(4-1-4-1)'
 '(4-3-3)' '(3-4-3)' '(5-1-2-2)' '(4-4-1-1)' '(4-2-2-2)' '(3-5-1-1)'
 '(3-2-2-2-1)' '(3-2-3-1-1)' '(4-1-3-2)' '(3-2-2-1-2)' '(5-3-2)' '(5-4-1)'
 '(3-2-3-2)' '(4-3-2-1)' '(3-1-4-1-1)' '(4-2-2-1-1)' '(4-3-1-2)'
 '(3-1-4-2)' '(3-3-2-2)' '(3-2-4-1)' '(3-3-2-1-1)']

Now the same for homeTeamTacticalSchema.

In [313]:
unique_values = df['homeTeamTacticalSchema'].unique()
print(unique_values)
['(4-3-3)' '(4-2-3-1)' '(4-4-2)' '(3-5-2)' '(4-1-4-1)' '(4-4-2◆)'
 '(3-4-3)' '(5-1-2-2)' '(3-5-1-1)' '(4-4-1-1)' '(3-4-1-2)' '(4-5-1)'
 '(3-1-4-2)' '(4-2-2-2)' '(5-3-2)' '(4-3-2-1)' '(3-2-4-1)' '(3-2-2-1-2)'
 '(3-3-2-2)' '(3-2-1-2-2)' '(3-4-3◆)' '(3-2-3-1-1)' '(3-2-2-2-1)'
 '(4-1-3-2)' '(3-2-3-2)' '(4-3-1-2)' '(5-4-1)' '(3-1-4-1-1)' '(4-2-2-1-1)'
 '(4-2-3-1◆)' '(3-5-2◆)' '(4-3-3◆)']
In [314]:
df.loc[df['homeTeamTacticalSchema'] == '(4-4-2◆)', 'homeTeamTacticalSchema'] = '(4-4-2)'
df.loc[df['homeTeamTacticalSchema'] == '(4-2-3-1◆)', 'homeTeamTacticalSchema'] = '(4-2-3-1)'
df.loc[df['homeTeamTacticalSchema'] == '(3-4-3◆)', 'homeTeamTacticalSchema'] = '(3-4-3)'
df.loc[df['homeTeamTacticalSchema'] == '(3-5-2◆)', 'homeTeamTacticalSchema'] = '(3-5-2)'
df.loc[df['homeTeamTacticalSchema'] == '(4-3-3◆)', 'homeTeamTacticalSchema'] = '(4-3-3)'
In [315]:
unique_values = df['homeTeamTacticalSchema'].unique()
print(unique_values)
['(4-3-3)' '(4-2-3-1)' '(4-4-2)' '(3-5-2)' '(4-1-4-1)' '(3-4-3)'
 '(5-1-2-2)' '(3-5-1-1)' '(4-4-1-1)' '(3-4-1-2)' '(4-5-1)' '(3-1-4-2)'
 '(4-2-2-2)' '(5-3-2)' '(4-3-2-1)' '(3-2-4-1)' '(3-2-2-1-2)' '(3-3-2-2)'
 '(3-2-1-2-2)' '(3-2-3-1-1)' '(3-2-2-2-1)' '(4-1-3-2)' '(3-2-3-2)'
 '(4-3-1-2)' '(5-4-1)' '(3-1-4-1-1)' '(4-2-2-1-1)']

We will transform these values into 3 columns: defense_schema (nr of players in defense), middle_schema, strikers_schema.

In [316]:
print(df['homeTeamTacticalSchema'].nunique())
27
In [317]:
def extract_numbers(formation):
    nums = formation.strip('()').split('-')
    nums = [int(num) for num in nums]
    while len(nums) < 3:
        nums.append(0)
    return nums

formations_home = [extract_numbers(formation) for formation in df['homeTeamTacticalSchema']]
formations_home
Out[317]:
[[4, 3, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [3, 5, 2],
 [3, 5, 2],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 4, 3],
 [4, 3, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 3, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 4, 3],
 [4, 4, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [5, 1, 2, 2],
 [4, 3, 3],
 [4, 3, 3],
 [4, 3, 3],
 [3, 5, 1, 1],
 [3, 4, 3],
 [3, 5, 2],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 1, 4, 1],
 [3, 4, 3],
 [3, 5, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 5, 2],
 [4, 4, 1, 1],
 [4, 4, 1, 1],
 [4, 4, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 4, 1, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 4, 1, 2],
 [3, 4, 3],
 [4, 4, 1, 1],
 [3, 4, 3],
 [4, 3, 3],
 [3, 4, 3],
 [4, 3, 3],
 [3, 4, 1, 2],
 [4, 4, 1, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 4, 1, 1],
 [4, 4, 1, 1],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 5, 1],
 [3, 5, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 3, 3],
 [3, 5, 2],
 [3, 4, 3],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 3, 3],
 [3, 4, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 3, 3],
 [4, 4, 2],
 [4, 5, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 3, 3],
 [4, 4, 1, 1],
 [4, 3, 3],
 [4, 3, 3],
 [4, 4, 2],
 [3, 1, 4, 2],
 [3, 5, 2],
 [3, 5, 2],
 [4, 3, 3],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [4, 3, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [4, 2, 2, 2],
 [4, 3, 3],
 [4, 4, 2],
 [3, 5, 2],
 [3, 5, 2],
 [5, 3, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 4, 2],
 [4, 3, 3],
 [4, 3, 2, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 4, 3],
 [3, 4, 1, 2],
 [3, 4, 3],
 [3, 4, 3],
 [3, 5, 2],
 [3, 4, 3],
 [3, 4, 3],
 [3, 4, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [3, 5, 2],
 [4, 3, 3],
 [4, 4, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 3, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 3, 3],
 [4, 3, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [3, 1, 4, 2],
 [4, 4, 1, 1],
 [4, 3, 3],
 [4, 5, 1],
 [3, 5, 2],
 [4, 3, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 5, 1, 1],
 [4, 4, 2],
 [4, 3, 3],
 [4, 4, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 2, 2],
 [4, 1, 4, 1],
 [3, 4, 1, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 3, 3],
 [4, 4, 2],
 [3, 2, 4, 1],
 [4, 4, 2],
 [4, 3, 3],
 [3, 5, 2],
 [3, 5, 2],
 [4, 3, 3],
 [4, 2, 2, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 5, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 4, 2],
 [3, 1, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 2, 2],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 3],
 [4, 3, 3],
 [3, 5, 2],
 [4, 3, 3],
 [4, 1, 4, 1],
 [3, 4, 3],
 [3, 5, 2],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 3, 3],
 [3, 4, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 4, 3],
 [3, 5, 2],
 [4, 3, 3],
 [3, 5, 2],
 [4, 3, 3],
 [3, 4, 3],
 [4, 4, 1, 1],
 [5, 3, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [3, 4, 3],
 [4, 3, 3],
 [3, 4, 3],
 [3, 4, 3],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 3, 3],
 [3, 5, 2],
 [4, 3, 3],
 [3, 4, 3],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [4, 3, 3],
 [4, 4, 2],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 4, 2],
 [3, 2, 2, 1, 2],
 [4, 3, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 3, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 4, 1, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 1, 1],
 [4, 4, 2],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 5, 2],
 [3, 4, 1, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [3, 4, 3],
 [3, 4, 3],
 [4, 3, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 4, 1, 2],
 [4, 3, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 3, 3],
 [3, 5, 2],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 5, 2],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 4, 1, 1],
 [4, 4, 2],
 [4, 3, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 1, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 5, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 1, 1],
 [4, 4, 2],
 [4, 3, 3],
 [3, 4, 1, 2],
 [3, 5, 2],
 [3, 4, 3],
 [4, 3, 3],
 [4, 4, 2],
 [3, 4, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 5, 2],
 [3, 4, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 3, 3],
 [4, 4, 2],
 [3, 4, 3],
 [4, 3, 3],
 [3, 4, 1, 2],
 [4, 4, 1, 1],
 [3, 4, 3],
 [4, 3, 3],
 [3, 4, 3],
 [4, 3, 3],
 [3, 4, 3],
 [4, 3, 3],
 [3, 4, 3],
 [4, 4, 2],
 [4, 4, 2],
 [3, 3, 2, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [3, 5, 2],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 1, 4, 2],
 [3, 5, 2],
 [3, 4, 3],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 3, 3],
 [4, 3, 3],
 [3, 5, 1, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 2, 2, 1, 2],
 [3, 4, 3],
 [3, 5, 2],
 [4, 4, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 4, 3],
 [4, 3, 3],
 [3, 2, 1, 2, 2],
 [4, 4, 2],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 5, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 3, 3],
 [4, 4, 2],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 3, 3],
 [3, 5, 2],
 [4, 3, 3],
 [4, 3, 3],
 [4, 3, 3],
 [3, 4, 3],
 [4, 3, 3],
 [4, 3, 3],
 [4, 3, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 3, 3],
 [3, 4, 1, 2],
 [3, 5, 2],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 3, 2, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [3, 4, 1, 2],
 [3, 4, 3],
 [3, 5, 2],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [3, 4, 3],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 5, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 2, 2, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 2, 2],
 [4, 3, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 3, 3],
 [4, 3, 3],
 [3, 4, 3],
 [3, 4, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [4, 5, 1],
 [3, 5, 2],
 [4, 3, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 5, 1],
 [4, 3, 3],
 [3, 5, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 3, 3],
 [3, 5, 2],
 [3, 5, 2],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 3, 3],
 [3, 4, 1, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 4, 3],
 [3, 4, 3],
 [4, 4, 2],
 [3, 4, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 1, 1],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 3, 3],
 [3, 4, 3],
 [4, 1, 4, 1],
 [3, 5, 2],
 [3, 4, 1, 2],
 [4, 3, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 1, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 4, 3],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 4, 3],
 [4, 4, 2],
 [4, 1, 4, 1],
 [3, 4, 3],
 [3, 5, 2],
 [3, 4, 1, 2],
 [3, 4, 3],
 [4, 4, 2],
 [3, 4, 3],
 [3, 4, 1, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 5, 2],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [3, 4, 1, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 5, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 3, 3],
 [4, 3, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 5, 2],
 [3, 4, 1, 2],
 [4, 1, 4, 1],
 [4, 3, 3],
 [3, 4, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 4, 3],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 5, 1],
 [4, 3, 3],
 [4, 4, 2],
 [3, 5, 2],
 [3, 5, 2],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 2, 3, 1, 1],
 [4, 4, 1, 1],
 [4, 3, 3],
 [4, 4, 2],
 [3, 4, 3],
 [3, 4, 3],
 [3, 5, 1, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 3, 3],
 [4, 2, 2, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 3, 3],
 [3, 4, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [3, 2, 2, 1, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 4, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 3, 2, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 4, 1, 2],
 [4, 3, 3],
 [4, 4, 2],
 [3, 5, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 1, 4, 2],
 [4, 4, 2],
 [4, 3, 3],
 [3, 4, 3],
 [3, 5, 2],
 [3, 4, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 4, 1, 1],
 [4, 3, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 3, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 1, 1],
 [4, 5, 1],
 [4, 3, 3],
 [4, 4, 1, 1],
 [4, 3, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 3, 3],
 [4, 3, 3],
 [4, 3, 3],
 [3, 5, 2],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 2, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 3, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 2, 2, 2],
 [4, 4, 2],
 [3, 4, 3],
 [3, 5, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 4, 1, 2],
 [3, 5, 1, 1],
 [4, 1, 4, 1],
 [3, 4, 1, 2],
 [4, 3, 3],
 [3, 4, 3],
 [3, 5, 2],
 [3, 4, 1, 2],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 1, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 3, 3],
 [3, 2, 2, 2, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 2, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 4, 2],
 ...]
In [318]:
#count nr of middle players
def sum_middle_values(formation):
    return sum(formation[1:-1])

df['defense_schema_home'] = [form[0] for form in formations_home]
df['middle_schema_home'] = [sum_middle_values(form) for form in formations_home]
df['strikers_schema_home'] = [form[-1] for form in formations_home]
In [319]:
df.drop(columns=['homeTeamTacticalSchema'], inplace=True)
df.head()
Out[319]:
awayScore awayTeamName awayTeamTacticalSchema homeScore homeTeamName matchDate matchWeek aerials_lost aerials_won age ... tackles_mid_3rd through_balls throw_ins touches_att_3rd touches_att_pen_area touches_def_pen_area championship_name defense_schema_home middle_schema_home strikers_schema_home
0 4 Napoli (4-4-2) 1 Spezia 2021-05-08 35 0.0 0.0 24.0 ... 0.0 0.0 0.0 0.0 0.0 42.0 seria a 4 3 3
1 4 Bayern Munich (4-2-3-1) 0 Schalke 04 2021-01-24 18 0.0 1.0 20.0 ... 3.0 0.0 4.0 20.0 0.0 4.0 bundesliga 4 5 1
2 0 Osasuna (3-5-2) 0 Levante 2021-12-05 16 0.0 1.0 23.0 ... 2.0 0.0 0.0 14.0 0.0 3.0 liga 4 3 3
3 2 Crystal Palace (4-5-1) 2 Arsenal 2019-10-27 10 0.0 1.0 28.0 ... 1.0 0.0 0.0 9.0 1.0 6.0 premier league 4 4 2
4 0 Hertha BSC (4-2-3-1) 2 Union Berlin 2021-11-20 12 3.0 0.0 24.0 ... 0.0 0.0 0.0 16.0 2.0 1.0 bundesliga 3 5 2

5 rows × 75 columns

In [320]:
print(df['awayTeamTacticalSchema'].nunique())
27
In [321]:
formations_away = [extract_numbers(formation) for formation in df['awayTeamTacticalSchema']]
formations_away
Out[321]:
[[4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 5, 1],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 4, 3],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 3, 3],
 [3, 4, 1, 2],
 [3, 4, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [5, 1, 2, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [3, 5, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 5, 2],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 3, 3],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 3, 3],
 [3, 5, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 3],
 [4, 3, 3],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 3, 3],
 [3, 4, 3],
 [3, 5, 2],
 [3, 4, 3],
 [3, 4, 3],
 [3, 4, 3],
 [4, 2, 2, 2],
 [4, 3, 3],
 [3, 4, 1, 2],
 [3, 5, 2],
 [3, 4, 1, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 4, 2],
 [4, 3, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 3, 3],
 [3, 5, 2],
 [3, 4, 3],
 [3, 4, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 5, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [3, 4, 3],
 [3, 4, 1, 2],
 [4, 4, 1, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [4, 4, 2],
 [3, 4, 3],
 [3, 4, 1, 2],
 [3, 4, 3],
 [3, 5, 1, 1],
 [3, 4, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 1, 1],
 [3, 4, 1, 2],
 [4, 3, 3],
 [3, 5, 1, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [4, 3, 3],
 [4, 4, 2],
 [3, 5, 2],
 [3, 5, 2],
 [4, 3, 3],
 [4, 3, 3],
 [3, 5, 2],
 [3, 4, 1, 2],
 [4, 4, 2],
 [4, 3, 3],
 [3, 5, 1, 1],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [3, 4, 1, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 4, 1, 2],
 [3, 5, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 4, 1, 2],
 [3, 2, 2, 2, 1],
 [3, 4, 3],
 [3, 4, 3],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [3, 4, 1, 2],
 [4, 3, 3],
 [4, 4, 2],
 [3, 2, 3, 1, 1],
 [4, 3, 3],
 [3, 5, 2],
 [4, 1, 3, 2],
 [4, 1, 4, 1],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 1, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 3, 3],
 [3, 5, 2],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [4, 2, 2, 2],
 [4, 4, 1, 1],
 [4, 3, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 5, 1],
 [3, 4, 3],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 5, 2],
 [4, 3, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [3, 4, 3],
 [3, 5, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 1, 2],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 5, 1],
 [4, 4, 2],
 [3, 4, 1, 2],
 [4, 1, 4, 1],
 [3, 2, 2, 1, 2],
 [4, 2, 3, 1],
 [4, 4, 1, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [3, 5, 2],
 [3, 4, 3],
 [4, 4, 2],
 [4, 3, 3],
 [3, 5, 2],
 [3, 4, 1, 2],
 [3, 5, 2],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [3, 2, 2, 2, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 1, 4, 1],
 [3, 4, 3],
 [3, 5, 2],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 4, 3],
 [5, 3, 2],
 [4, 3, 3],
 [4, 4, 2],
 [4, 3, 3],
 [4, 4, 2],
 [4, 3, 3],
 [3, 4, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 2, 2],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 2, 2],
 [4, 5, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 3, 3],
 [4, 3, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 4, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 4, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 4, 1, 2],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 1, 3, 2],
 [3, 4, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 1, 2],
 [3, 4, 3],
 [3, 4, 1, 2],
 [3, 4, 1, 2],
 [3, 4, 3],
 [3, 4, 3],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 4, 2],
 [3, 4, 1, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 3, 3],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 4, 1, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [5, 4, 1],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [5, 1, 2, 2],
 [4, 1, 4, 1],
 [3, 5, 2],
 [3, 5, 2],
 [5, 1, 2, 2],
 [3, 2, 3, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 3, 3],
 [4, 4, 2],
 [3, 4, 3],
 [3, 4, 1, 2],
 [3, 4, 1, 2],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 4, 1, 1],
 [4, 4, 2],
 [4, 3, 3],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 3, 3],
 [3, 5, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 4, 2],
 [3, 4, 3],
 [4, 3, 3],
 [4, 3, 3],
 [3, 4, 1, 2],
 [3, 5, 2],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 3],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [5, 3, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 4, 2],
 [4, 3, 3],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 1, 2],
 [4, 3, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [3, 4, 1, 2],
 [3, 4, 1, 2],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 5, 1],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 1, 1],
 [4, 4, 1, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 2, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 2, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 3],
 [4, 4, 2],
 [3, 5, 2],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 3, 3],
 [4, 3, 3],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 4, 2],
 [3, 5, 2],
 [4, 1, 4, 1],
 [4, 4, 1, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 4, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 1, 1],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 2, 3, 1, 1],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [3, 4, 3],
 [3, 4, 3],
 [4, 4, 2],
 [4, 1, 4, 1],
 [3, 1, 4, 1, 1],
 [3, 5, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [3, 4, 3],
 [3, 5, 2],
 [4, 4, 2],
 [4, 3, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 4, 1, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 1, 4, 1],
 [3, 4, 1, 2],
 [4, 1, 4, 1],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 3, 3],
 [3, 4, 3],
 [3, 5, 2],
 [3, 4, 1, 2],
 [3, 4, 3],
 [3, 4, 3],
 [3, 2, 2, 1, 2],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 4, 3],
 [4, 4, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 3, 2, 1],
 [4, 4, 2],
 [3, 5, 2],
 [3, 4, 3],
 [3, 4, 1, 2],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 4, 3],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 3, 3],
 [4, 3, 3],
 [4, 2, 2, 2],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [4, 1, 4, 1],
 [3, 4, 1, 2],
 [4, 1, 4, 1],
 [3, 5, 2],
 [3, 5, 2],
 [3, 4, 3],
 [4, 1, 4, 1],
 [4, 3, 3],
 [3, 4, 1, 2],
 [3, 4, 3],
 [4, 4, 2],
 [3, 4, 1, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 5, 2],
 [3, 5, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [3, 5, 2],
 [4, 4, 2],
 [4, 3, 2, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [3, 4, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 3, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 3, 3],
 [3, 5, 2],
 [4, 4, 2],
 [4, 4, 2],
 [3, 4, 3],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [3, 2, 3, 1, 1],
 [4, 3, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 4, 1, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 4, 3],
 [4, 4, 2],
 [3, 5, 2],
 [3, 4, 3],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 1, 4, 1],
 [3, 4, 3],
 [3, 4, 1, 2],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 1, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 4, 3],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [3, 5, 1, 1],
 [4, 1, 4, 1],
 [5, 3, 2],
 [4, 4, 1, 1],
 [4, 4, 2],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 4, 2],
 [3, 5, 2],
 [3, 4, 3],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 3, 3],
 [4, 3, 3],
 [4, 1, 4, 1],
 [4, 4, 1, 1],
 [3, 4, 3],
 [4, 3, 3],
 [4, 1, 4, 1],
 [3, 4, 3],
 [3, 4, 3],
 [4, 4, 2],
 [4, 5, 1],
 [3, 5, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 3, 3],
 [3, 4, 3],
 [3, 5, 2],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 5, 1],
 [5, 3, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [4, 2, 2, 1, 1],
 [4, 5, 1],
 [4, 4, 2],
 [3, 4, 1, 2],
 [3, 4, 3],
 [3, 4, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 4, 3],
 [4, 4, 2],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 5, 2],
 [4, 1, 4, 1],
 [4, 3, 1, 2],
 [4, 1, 4, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 3, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 2, 2, 2],
 [3, 5, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [3, 4, 3],
 [3, 5, 2],
 [3, 5, 2],
 [3, 4, 3],
 [4, 4, 2],
 [4, 3, 3],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [3, 4, 3],
 [4, 1, 4, 1],
 [4, 4, 2],
 [4, 4, 2],
 [3, 4, 3],
 [3, 4, 3],
 [3, 5, 1, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [3, 4, 3],
 [4, 3, 3],
 [3, 5, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 5, 1],
 [4, 3, 2, 1],
 [3, 4, 3],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 5, 2],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [3, 4, 3],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 2, 2, 2],
 [3, 5, 2],
 [3, 4, 1, 2],
 [4, 2, 3, 1],
 [4, 3, 3],
 [4, 4, 2],
 [4, 4, 2],
 [3, 4, 3],
 [4, 3, 3],
 [4, 4, 1, 1],
 [4, 1, 4, 1],
 [3, 4, 1, 2],
 [4, 1, 4, 1],
 [4, 3, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 1, 2],
 [3, 5, 2],
 [4, 1, 4, 1],
 [3, 5, 1, 1],
 [3, 5, 2],
 [4, 3, 3],
 [3, 5, 2],
 [4, 1, 4, 1],
 [3, 5, 2],
 [4, 4, 2],
 [3, 1, 4, 2],
 [3, 4, 3],
 [4, 2, 2, 2],
 [3, 4, 3],
 [4, 3, 3],
 [4, 4, 2],
 [4, 1, 4, 1],
 [3, 4, 3],
 [3, 4, 3],
 [4, 1, 4, 1],
 [3, 4, 1, 2],
 [3, 5, 2],
 [4, 2, 3, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 3, 3],
 [3, 5, 2],
 [3, 4, 3],
 [3, 4, 3],
 [4, 3, 3],
 [4, 5, 1],
 [4, 4, 2],
 [3, 2, 2, 1, 2],
 [3, 4, 3],
 [3, 4, 3],
 [3, 5, 2],
 [4, 4, 2],
 [3, 4, 3],
 [3, 5, 2],
 [4, 4, 2],
 [4, 2, 3, 1],
 [3, 5, 2],
 [3, 5, 2],
 [3, 5, 2],
 [3, 4, 3],
 [4, 4, 2],
 [3, 4, 3],
 [4, 4, 2],
 [4, 3, 2, 1],
 [4, 4, 2],
 [4, 3, 3],
 [4, 2, 3, 1],
 [5, 3, 2],
 [4, 5, 1],
 [3, 4, 3],
 [4, 4, 1, 1],
 [4, 2, 3, 1],
 [3, 5, 2],
 [4, 2, 2, 2],
 [4, 2, 3, 1],
 [3, 4, 3],
 [3, 5, 2],
 [3, 2, 2, 2, 1],
 [4, 4, 2],
 [4, 2, 3, 1],
 [4, 1, 4, 1],
 [4, 2, 3, 1],
 [4, 4, 2],
 [4, 4, 2],
 [4, 3, 3],
 [4, 3, 3],
 ...]
In [322]:
df['defense_schema_away'] = [form[0] for form in formations_away]
df['middle_schema_away'] = [sum_middle_values(form) for form in formations_away]
df['strikers_schema_away'] = [form[-1] for form in formations_away]
In [323]:
df.drop(columns=['awayTeamTacticalSchema'], inplace=True)
df.head()
Out[323]:
awayScore awayTeamName homeScore homeTeamName matchDate matchWeek aerials_lost aerials_won age assisted_shots ... touches_att_3rd touches_att_pen_area touches_def_pen_area championship_name defense_schema_home middle_schema_home strikers_schema_home defense_schema_away middle_schema_away strikers_schema_away
0 4 Napoli 1 Spezia 2021-05-08 35 0.0 0.0 24.0 0.0 ... 0.0 0.0 42.0 seria a 4 3 3 4 4 2
1 4 Bayern Munich 0 Schalke 04 2021-01-24 18 0.0 1.0 20.0 0.0 ... 20.0 0.0 4.0 bundesliga 4 5 1 4 5 1
2 0 Osasuna 0 Levante 2021-12-05 16 0.0 1.0 23.0 1.0 ... 14.0 0.0 3.0 liga 4 3 3 3 5 2
3 2 Crystal Palace 2 Arsenal 2019-10-27 10 0.0 1.0 28.0 0.0 ... 9.0 1.0 6.0 premier league 4 4 2 4 5 1
4 0 Hertha BSC 2 Union Berlin 2021-11-20 12 3.0 0.0 24.0 0.0 ... 16.0 2.0 1.0 bundesliga 3 5 2 4 5 1

5 rows × 77 columns

Now let's transform matchDate as in EDA: into year and month.

In [324]:
df['matchYear'] = pd.to_datetime(df['matchDate']).dt.year
df['matchMonth'] = pd.to_datetime(df['matchDate']).dt.month

df.drop(columns=['matchDate'], inplace=True)
df.head()
Out[324]:
awayScore awayTeamName homeScore homeTeamName matchWeek aerials_lost aerials_won age assisted_shots assists ... touches_def_pen_area championship_name defense_schema_home middle_schema_home strikers_schema_home defense_schema_away middle_schema_away strikers_schema_away matchYear matchMonth
0 4 Napoli 1 Spezia 35 0.0 0.0 24.0 0.0 0 ... 42.0 seria a 4 3 3 4 4 2 2021 5
1 4 Bayern Munich 0 Schalke 04 18 0.0 1.0 20.0 0.0 0 ... 4.0 bundesliga 4 5 1 4 5 1 2021 1
2 0 Osasuna 0 Levante 16 0.0 1.0 23.0 1.0 0 ... 3.0 liga 4 3 3 3 5 2 2021 12
3 2 Crystal Palace 2 Arsenal 10 0.0 1.0 28.0 0.0 0 ... 6.0 premier league 4 4 2 4 5 1 2019 10
4 0 Hertha BSC 2 Union Berlin 12 3.0 0.0 24.0 0.0 0 ... 1.0 bundesliga 3 5 2 4 5 1 2021 11

5 rows × 78 columns

In [325]:
cat_cols = df.select_dtypes(include='object').columns
unique_values_count = df[cat_cols].nunique()
print(unique_values_count)
awayTeamName         129
homeTeamName         122
championship_name      6
dtype: int64

We will remove team names, as they are not sequential, so we cannot use label encoding and there are too many values to use hasher without collisions. besides, the name of a team should not influence our assessment of how someone performed in a match.

In [326]:
df.drop(columns=['awayTeamName'], inplace=True)
df.drop(columns=['homeTeamName'], inplace=True)

For championship name we can use one hot encoding, as there are not so many different values.

In [327]:
df.reset_index(drop=True, inplace=True)
In [328]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

transformed_data = ohe.fit_transform(df[['championship_name']])

df_encoded = pd.DataFrame(transformed_data.toarray(), columns=ohe.get_feature_names_out(['championship_name']))
df.drop(columns=['championship_name'], inplace=True)
df = df.join(df_encoded)
In [329]:
df.head()
Out[329]:
awayScore homeScore matchWeek aerials_lost aerials_won age assisted_shots assists ball_recoveries blocked_shots ... middle_schema_away strikers_schema_away matchYear matchMonth championship_name_bundesliga championship_name_liga championship_name_ligue 1 championship_name_premier league championship_name_seria a championship_name_unknown
0 4 1 35 0.0 0.0 24.0 0.0 0 1.0 0.0 ... 4 2 2021 5 0.0 0.0 0.0 0.0 1.0 0.0
1 4 0 18 0.0 1.0 20.0 0.0 0 11.0 0.0 ... 5 1 2021 1 1.0 0.0 0.0 0.0 0.0 0.0
2 0 0 16 0.0 1.0 23.0 1.0 0 10.0 0.0 ... 5 2 2021 12 0.0 1.0 0.0 0.0 0.0 0.0
3 2 2 10 0.0 1.0 28.0 0.0 0 16.0 0.0 ... 5 1 2019 10 0.0 0.0 0.0 1.0 0.0 0.0
4 0 2 12 3.0 0.0 24.0 0.0 0 6.0 0.0 ... 5 1 2021 11 1.0 0.0 0.0 0.0 0.0 0.0

5 rows × 81 columns

Let's check if we didn't create any NULL values by mistake.

In [330]:
null_counts = df.isnull().sum()
print("Null counts for each column:")
print(null_counts[null_counts>0])
Null counts for each column:
Series([], dtype: int64)
In [331]:
cat_cols = df.select_dtypes(include='object').columns

print(f'categorical columns: {cat_cols}')
print(f'number of categorical columns: {len(cat_cols)}')
categorical columns: Index([], dtype='object')
number of categorical columns: 0

No categorical values - 1.3 done:)

1.4 Outliers¶

Automatic outlier detection using knn from pyod

In [333]:
from pyod.models.knn import KNN
clf = KNN(contamination=0.04)
clf.fit(df)
df['outliers'] = clf.labels_
df['outliers'].value_counts()
Out[333]:
0    51148
1     2116
Name: outliers, dtype: int64
In [334]:
df = df[df['outliers']==0]
df.drop(columns=['outliers'], inplace=True)
In [338]:
fig, axs = plt.subplots(21, 4, figsize = (24, 108))
axs = axs.flatten()

for i, col in enumerate(df.columns):
    sns.boxplot(data=df, x=col, ax=axs[i])

plt.tight_layout()
plt.show()

It didn't remove rare but important values such as a lot of goals. It did what we wanted. 1.4 done

In [339]:
df.to_csv('../data/for_modellers/after_feature_engineering.csv', index=False)
In [376]:
df=pd.read_csv('../data/for_modellers/after_feature_engineering.csv')

1.5 Data transformation and scaling¶

In [360]:
df['matchMonth'] = df["matchMonth"].apply(lambda x: np.sin(x * (2 * np.pi / 12)))

df['matchYear'] = df['matchYear'].replace({2019: 0, 2020: 1, 2021: 2})
In [364]:
'''
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import StandardScaler

pt = PowerTransformer(method='yeo-johnson')
scaler = StandardScaler()


df_transformed = pt.fit_transform(df)

df_scaled = scaler.fit_transform(df_transformed)

df_scaled = pd.DataFrame(df_scaled, columns=df.columns)
'''
In [378]:
from sklearn.preprocessing import  MinMaxScaler

scaler = MinMaxScaler()


df_scaled = scaler.fit_transform(df)


df_scaled = pd.DataFrame(df_scaled, columns=df.columns)
In [380]:
df_scaled.to_csv('../data/for_modellers/after_scaling.csv', index=False)
In [7]:
df=pd.read_csv('../data/for_modellers/after_scaling.csv')

2. Pre-modelling¶

In [4]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score 
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
seed=42
In [465]:
def count_wcss_scores(X, k_max):
    scores = []
    for k in range(1, k_max+1):
        kmeans = KMeans(n_clusters=k, random_state=seed)
        kmeans.fit(X)
        wcss = kmeans.score(X) * -1
        scores.append(wcss)
    return scores
In [466]:
wcss_vec = count_wcss_scores(df, 15)
x_ticks = list(range(1, len(wcss_vec) + 1))
plt.plot(x_ticks, wcss_vec, 'x-', color = "brown")
plt.xlabel('k')
plt.ylabel('Within-cluster sum of squares')
plt.title('The Elbow Method showing the optimal k')
plt.grid()
plt.show()
In [392]:
from sklearn.metrics import silhouette_score
def count_clustering_scores(X, cluster_num, model, score_fun):
    if isinstance(cluster_num, int):
        cluster_num_iter = [cluster_num]
    else:
        cluster_num_iter = cluster_num
        
    scores = []    
    for k in cluster_num_iter:
        model_instance = model(n_clusters=k, random_state = seed)
        labels = model_instance.fit_predict(X)
        wcss = score_fun(X, labels)
        scores.append(wcss)
    
    if isinstance(cluster_num, int):
        return scores[0]
    else:
        return scores
In [393]:
cluster_num_seq = range(2, 20)
silhouette_vec = count_clustering_scores(df, cluster_num_seq, KMeans, silhouette_score)
plt.plot(cluster_num_seq, silhouette_vec, 'x-', color = "brown")
plt.xlabel('k')
plt.ylabel('Silhouette score')
plt.grid()
plt.show()
In [13]:
def metrics_plots(max_k=10):

    score = []

    score_kmeans_c = []
    score_kmeans_d = []

    for k in range(2, max_k):
        kmeans = KMeans(n_clusters=k, random_state= 101)
        predictions = kmeans.fit_predict(df)
        score.append(kmeans.score(df))
        score_kmeans_c.append(calinski_harabasz_score(df, kmeans.labels_))
        score_kmeans_d.append(davies_bouldin_score(df, predictions))

    list_scores = [score, score_kmeans_c, score_kmeans_d] 
    list_title = ['Within-cluster sum of squares', 'Calinski Harabasz', 'Davies Bouldin'] 
    for i in range(len(list_scores)):
        x_ticks = list(range(2, len(list_scores[i]) + 2))
        plt.plot(x_ticks, list_scores[i], 'bx-')
        plt.xlabel('k')
        plt.ylabel(list_title[i])
        plt.title('Optimal k')
        plt.show()
In [14]:
metrics_plots(10)
In [15]:
from sklearn.decomposition import PCA
import matplotlib.cm as cm
pca = PCA()
result = pca.fit_transform(df)
In [19]:
kmeans = KMeans(n_clusters=5, random_state=seed)
labels = kmeans.fit_predict(df)
sns.scatterplot(x = result[:, 0], y = result[:, 1], hue = labels[:], palette=sns.color_palette("hls", 5))
plt.show()
In [467]:
from pca import pca
# Or reduce the data towards 2 PCs
pca_2 = pca(n_components=7)
# Fit transform
results = pca_2.fit_transform(df)
# Plot explained variance
fig, ax = pca_2.plot()

# Scatter first 2 PCs
fig, ax = pca_2.scatter()

# Make biplot with the number of features
fig, ax = pca_2.biplot(n_feat=7)
[pca] >Extracting column labels from dataframe.
[pca] >Extracting row labels from dataframe.
[pca] >The PCA reduction is performed on the [81] columns of the input dataframe.
[pca] >Fit using PCA.
[pca] >Compute loadings and PCs.
[pca] >Compute explained variance.
[pca] >Outlier detection using Hotelling T2 test with alpha=[0.05] and n_components=[7]
[pca] >Multiple test correction applied for Hotelling T2 test: [fdr_bh]
[pca] >Outlier detection using SPE/DmodX with n_std=[3]
[scatterd] >INFO> Create scatterplot
[scatterd] >INFO> Create scatterplot
[scatterd]> WARNING use the standardized verbose status. The status [1-6] will be deprecated in future versions.
[pca] >Plot PC1 vs PC2 with loadings.
[scatterd]> WARNING use the standardized verbose status. The status [1-6] will be deprecated in future versions.
In [32]:
df['label'] = labels
a = df[df['championship_name_bundesliga']==1].shape
b = df[df['championship_name_bundesliga']==1][df['label']==0].shape
(a,b)
C:\Users\micha\AppData\Local\Temp\ipykernel_21692\3823215361.py:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  b = df[df['championship_name_bundesliga']==1][df['label']==0].shape
Out[32]:
((8717, 82), (8717, 82))
In [29]:
df['label'] = labels
a = df[df['championship_name_seria a']==1].shape
b = df[df['championship_name_seria a']==1][df['label']==2].shape
(a,b)
C:\Users\micha\AppData\Local\Temp\ipykernel_21692\2680441018.py:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  b = df[df['championship_name_seria a']==1][df['label']==2].shape
Out[29]:
((10937, 82), (10937, 82))
In [35]:
df['label'] = labels
a = df[df['championship_name_liga']==1].shape
b = df[df['championship_name_liga']==1][df['label']==4].shape
(a,b)
C:\Users\micha\AppData\Local\Temp\ipykernel_21692\3656188614.py:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  b = df[df['championship_name_liga']==1][df['label']==4].shape
Out[35]:
((11255, 82), (11255, 82))
In [37]:
df['label'] = labels
a = df[df['championship_name_premier league']==1].shape
b = df[df['championship_name_premier league']==1][df['label']==3].shape
(a,b)
C:\Users\micha\AppData\Local\Temp\ipykernel_21692\121051250.py:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  b = df[df['championship_name_premier league']==1][df['label']==3].shape
Out[37]:
((9564, 82), (9564, 82))
In [38]:
df['label'] = labels
a = df[df['championship_name_ligue 1']==1].shape
b = df[df['championship_name_ligue 1']==1][df['label']==1].shape
(a,b)
C:\Users\micha\AppData\Local\Temp\ipykernel_21692\1793682170.py:3: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  b = df[df['championship_name_ligue 1']==1][df['label']==1].shape
Out[38]:
((10239, 82), (10239, 82))
In [44]:
df[(df['championship_name_unknown']==1)].label.value_counts()
Out[44]:
2    306
3     70
1     39
4     18
0      3
Name: label, dtype: int64
In [481]:
import plotly.express as px
import plotly.io as pio

# Assuming df is your DataFrame
pio.renderers.default = 'iframe'
# Perform PCA
pca = PCA()
result = pca.fit_transform(df)

# Perform K-means clustering
kmeans = KMeans(n_clusters=5, random_state=seed)
labels = kmeans.fit_predict(df)

# Prepare DataFrame for plotting
for_plot = pd.DataFrame({f"{i}": result[:, i] for i in range(result.shape[1])})
for_plot['hue'] = pd.Series(labels).map(lambda x: str(x))

# Create the 3D scatter plot
fig = px.scatter_3d(for_plot, x='0', y='1', z='2', color='hue', category_orders={'hue': [str(i) for i in range(5)]})
In [45]:
import os
os.system('jupyter nbconvert --to html feature_engineering.ipynb')
Out[45]:
0